{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this tutorial we'll demonstrate Coach's hierarchical RL support, by building a new agent that implements the Hierarchical Actor Critic (HAC) algorithm (https://arxiv.org/pdf/1712.00948.pdf), and a preset that runs the agent on Mujoco's pendulum challenge."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# The Agent"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First, some imports. Note that HAC is based on DDPG, hence we will be importing the relevant classes.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "# make the parent directory and its rl_coach sub-directory importable from this notebook\n",
    "module_path = os.path.abspath('..')\n",
    "if module_path not in sys.path:\n",
    "    sys.path.extend([module_path, module_path + '/rl_coach'])\n",
    "    \n",
    "from typing import Union\n",
    "import numpy as np\n",
    "from rl_coach.agents.ddpg_agent import DDPGAgent, DDPGAgentParameters, DDPGAlgorithmParameters\n",
    "from rl_coach.spaces import SpacesDefinition\n",
    "from rl_coach.core_types import RunPhase"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let's define the HAC algorithm and agent parameters.\n",
    "\n",
    "See tutorial 1 for more details on the content of each of these classes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class HACDDPGAlgorithmParameters(DDPGAlgorithmParameters):\n",
    "    \"\"\"DDPG algorithm parameters extended with the two settings that the HAC agent in this notebook reads.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "        # probability with which the top level agent marks a sub-goal it generates for testing\n",
    "        self.sub_goal_testing_rate = 0.5\n",
    "        # a level that misses a tested sub-goal receives a reward of -time_limit\n",
    "        self.time_limit = 40\n",
    "\n",
    "\n",
    "class HACDDPGAgentParameters(DDPGAgentParameters):\n",
    "    \"\"\"DDPG agent parameters whose algorithm parameters are replaced with the HAC ones.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "        # must be the HAC subclass: HACDDPGAgent reads algorithm.sub_goal_testing_rate and algorithm.time_limit,\n",
    "        # which are the attributes that HACDDPGAlgorithmParameters adds on top of the DDPG ones\n",
    "        self.algorithm = HACDDPGAlgorithmParameters()\n",
    "        # NOTE(review): nothing here overrides how the agent class is selected - confirm that the HACDDPGAgent\n",
    "        # class defined in this notebook is the one that actually gets instantiated for these parameters"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we'll define the agent itself - ```HACDDPGAgent``` - which subclasses the DDPG agent class. The main difference between the DDPG agent and the HACDDPGAgent is the sub-goal that a higher level agent defines for a lower level agent, which is why several of the DDPG agent's functions are overridden."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class HACDDPGAgent(DDPGAgent):\n",
    "    \"\"\"A DDPG agent that serves as a single level of a Hierarchical Actor Critic (HAC) hierarchy.\n",
    "\n",
    "    Every level except the lowest one may have the sub-goals it emits as actions tested, and every level except\n",
    "    the highest one is rewarded according to the goal stored in ``self.current_hrl_goal``. The decision whether the\n",
    "    current sub-goal is tested is kept on the parent graph manager, which is where this class reads it from.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):\n",
    "        super().__init__(agent_parameters, parent)\n",
    "        self.sub_goal_testing_rate = self.ap.algorithm.sub_goal_testing_rate\n",
    "        # not read by any of the methods of this class\n",
    "        self.graph_manager = None\n",
    "\n",
    "    def choose_action(self, curr_state):\n",
    "        \"\"\"Select the exploration phase for this step and then delegate the action choice to DDPG.\n",
    "\n",
    "        :param curr_state: the current state, passed on unchanged to the DDPG implementation\n",
    "        :return: whatever the DDPG choose_action returns for curr_state\n",
    "        \"\"\"\n",
    "        hrl_graph = self.parent_level_manager.parent_graph_manager\n",
    "\n",
    "        # the top level decides, for each sub-goal it generates, whether this is a sub-goal testing phase; the\n",
    "        # decision is stored on the graph manager so that the levels beneath it follow the same decision\n",
    "        if self.ap.is_a_highest_level_agent:\n",
    "            hrl_graph.should_test_current_sub_goal = np.random.rand() < self.sub_goal_testing_rate\n",
    "\n",
    "        # while training, a tested sub-goal is pursued with the exploration policy switched to the TEST phase\n",
    "        if self.phase == RunPhase.TRAIN:\n",
    "            exploration_phase = RunPhase.TEST if hrl_graph.should_test_current_sub_goal else self.phase\n",
    "            self.exploration_policy.change_phase(exploration_phase)\n",
    "\n",
    "        return super().choose_action(curr_state)\n",
    "\n",
    "    def update_transition_before_adding_to_replay_buffer(self, transition):\n",
    "        \"\"\"Relabel a transition with this level's goal and penalize a tested sub-goal that was missed.\n",
    "\n",
    "        :param transition: the transition to update; it is modified in place\n",
    "        :return: the same transition object\n",
    "        \"\"\"\n",
    "        hrl_graph = self.parent_level_manager.parent_graph_manager\n",
    "\n",
    "        # a level below the top one works towards the goal given to it from above: store that goal in both states,\n",
    "        # record the distance from it, and replace the reward and termination signals with the goal based ones\n",
    "        if not self.ap.is_a_highest_level_agent:\n",
    "            transition.state['desired_goal'] = self.current_hrl_goal\n",
    "            transition.next_state['desired_goal'] = self.current_hrl_goal\n",
    "            self.distance_from_goal.add_sample(\n",
    "                self.spaces.goal.distance_from_goal(self.current_hrl_goal, transition.next_state))\n",
    "            reward_for_goal, goal_reached = self.spaces.goal.get_reward_for_goal_and_state(\n",
    "                self.current_hrl_goal, transition.next_state)\n",
    "            transition.reward = reward_for_goal\n",
    "            transition.game_over = transition.game_over or goal_reached\n",
    "\n",
    "        # each level tests its own generated sub-goals (its actions); a missed one costs -time_limit\n",
    "        tests_own_sub_goal = not self.ap.is_a_lowest_level_agent and hrl_graph.should_test_current_sub_goal\n",
    "        if tests_own_sub_goal:\n",
    "            _, own_sub_goal_reached = self.spaces.goal.get_reward_for_goal_and_state(\n",
    "                transition.action, transition.next_state)\n",
    "            if not own_sub_goal_reached:\n",
    "                transition.reward = -self.ap.algorithm.time_limit\n",
    "\n",
    "        return transition\n",
    "\n",
    "    def set_environment_parameters(self, spaces: SpacesDefinition):\n",
    "        \"\"\"Set the spaces as DDPG does and then apply the goal related adjustments of this level.\"\"\"\n",
    "        super().set_environment_parameters(spaces)\n",
    "\n",
    "        if self.ap.is_a_highest_level_agent:\n",
    "            # the rest of the levels already have an in_action_space set to be of type GoalsSpace, thus they will\n",
    "            # have their GoalsSpace set to the in_action_space in agent.set_environment_parameters(); the top level\n",
    "            # uses its own action space as the goal space instead\n",
    "            self.spaces.goal = self.spaces.action\n",
    "            self.spaces.goal.set_target_space(self.spaces.state[self.spaces.goal.goal_name])\n",
    "        else:\n",
    "            # for the lower levels the reward success threshold is the reward given for reaching a goal\n",
    "            self.spaces.reward.reward_success_threshold = self.spaces.goal.reward_type.goal_reaching_reward\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# The Preset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Defining the top agent in the hierarchy. Note that the agent's base parameters are the same as the DDPG agent's parameters. We also define here the memory, exploration policy and network topology."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from rl_coach.architectures.tensorflow_components.layers import Dense\n",
    "from rl_coach.base_parameters import VisualizationParameters, EmbeddingMergerType, EmbedderScheme\n",
    "from rl_coach.architectures.embedder_parameters import InputEmbedderParameters\n",
    "from rl_coach.memories.episodic.episodic_hindsight_experience_replay import HindsightGoalSelectionMethod, \\\n",
    "    EpisodicHindsightExperienceReplayParameters\n",
    "from rl_coach.memories.episodic.episodic_hrl_hindsight_experience_replay import \\\n",
    "    EpisodicHRLHindsightExperienceReplayParameters\n",
    "from rl_coach.memories.memory import MemoryGranularity\n",
    "from rl_coach.spaces import GoalsSpace, ReachingGoal\n",
    "from rl_coach.exploration_policies.ou_process import OUProcessParameters\n",
    "from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps, RunPhase, TrainingSteps\n",
    "\n",
    "\n",
    "# these three values are also handed to the simulator through the environment parameters further down\n",
    "time_limit = 1000\n",
    "polar_coordinates = False\n",
    "distance_from_goal_threshold = np.array([0.075, 0.075, 0.75])\n",
    "\n",
    "# the distance between a goal and a state is their element-wise absolute difference\n",
    "reaching_goal_reward = ReachingGoal(default_reward=-1, goal_reaching_reward=0,\n",
    "                                    distance_from_goal_threshold=distance_from_goal_threshold)\n",
    "goals_space = GoalsSpace('achieved_goal', reaching_goal_reward, lambda goal, state: np.abs(goal - state))\n",
    "\n",
    "top_agent_params = HACDDPGAgentParameters()\n",
    "\n",
    "# memory - Hindsight Experience Replay\n",
    "top_agent_params.memory = EpisodicHRLHindsightExperienceReplayParameters()\n",
    "top_memory = top_agent_params.memory\n",
    "top_memory.max_size = (MemoryGranularity.Transitions, 10000000)\n",
    "top_memory.hindsight_transitions_per_regular_transition = 3\n",
    "top_memory.hindsight_goal_selection_method = HindsightGoalSelectionMethod.Future\n",
    "top_memory.goals_space = goals_space\n",
    "\n",
    "# how long to play, how long to train, and how often the target networks are synced\n",
    "top_algorithm = top_agent_params.algorithm\n",
    "top_algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(32)\n",
    "top_algorithm.num_consecutive_training_steps = 40\n",
    "top_algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)\n",
    "\n",
    "# exploration - OU process\n",
    "top_agent_params.exploration = OUProcessParameters()\n",
    "top_agent_params.exploration.theta = 0.1\n",
    "\n",
    "# actor - note that the default middleware is overridden with 3 dense layers\n",
    "top_actor = top_agent_params.network_wrappers['actor']\n",
    "top_actor.input_embedders_parameters = {\n",
    "    input_name: InputEmbedderParameters(scheme=EmbedderScheme.Empty)\n",
    "    for input_name in ('observation', 'desired_goal')}\n",
    "top_actor.middleware_parameters.scheme = [Dense([64])] * 3\n",
    "top_actor.learning_rate = 0.001\n",
    "top_actor.batch_size = 4096\n",
    "\n",
    "# critic - note that the default middleware is overridden with 3 dense layers\n",
    "top_critic = top_agent_params.network_wrappers['critic']\n",
    "top_critic.input_embedders_parameters = {\n",
    "    input_name: InputEmbedderParameters(scheme=EmbedderScheme.Empty)\n",
    "    for input_name in ('observation', 'action', 'desired_goal')}\n",
    "top_critic.embedding_merger_type = EmbeddingMergerType.Concat\n",
    "top_critic.middleware_parameters.scheme = [Dense([64])] * 3\n",
    "top_critic.learning_rate = 0.001\n",
    "top_critic.batch_size = 4096"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
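    "Next, we define the bottom agent in the hierarchy. It is given the goals space as its `in_action_space`, stores its experience in a hindsight experience replay memory, and explores with e-greedy, using an OU process as the continuous exploration policy."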
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from rl_coach.schedules import ConstantSchedule\n",
    "from rl_coach.exploration_policies.e_greedy import EGreedyParameters\n",
    "\n",
    "\n",
    "bottom_agent_params = HACDDPGAgentParameters()\n",
    "# the bottom level is given the goals space as its in_action_space\n",
    "bottom_agent_params.algorithm.in_action_space = goals_space\n",
    "\n",
    "# memory - Hindsight Experience Replay\n",
    "bottom_agent_params.memory = EpisodicHindsightExperienceReplayParameters()\n",
    "bottom_memory = bottom_agent_params.memory\n",
    "bottom_memory.max_size = (MemoryGranularity.Transitions, 12000000)\n",
    "bottom_memory.hindsight_transitions_per_regular_transition = 4\n",
    "bottom_memory.hindsight_goal_selection_method = HindsightGoalSelectionMethod.Future\n",
    "bottom_memory.goals_space = goals_space\n",
    "\n",
    "# how long to play, how long to train, and how often the target networks are synced\n",
    "bottom_algorithm = bottom_agent_params.algorithm\n",
    "bottom_algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(16 * 25)  # 25 episodes is one true env episode\n",
    "bottom_algorithm.num_consecutive_training_steps = 40\n",
    "bottom_algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)\n",
    "\n",
    "# exploration - e-greedy with a constant epsilon, with an OU process as its continuous exploration policy\n",
    "bottom_agent_params.exploration = EGreedyParameters()\n",
    "bottom_exploration = bottom_agent_params.exploration\n",
    "bottom_exploration.epsilon_schedule = ConstantSchedule(0.2)\n",
    "bottom_exploration.evaluation_epsilon = 0\n",
    "bottom_exploration.continuous_exploration_policy_parameters = OUProcessParameters()\n",
    "bottom_exploration.continuous_exploration_policy_parameters.theta = 0.1\n",
    "\n",
    "# actor - same inputs and layers as the top level actor\n",
    "bottom_actor = bottom_agent_params.network_wrappers['actor']\n",
    "bottom_actor.input_embedders_parameters = {\n",
    "    input_name: InputEmbedderParameters(scheme=EmbedderScheme.Empty)\n",
    "    for input_name in ('observation', 'desired_goal')}\n",
    "bottom_actor.middleware_parameters.scheme = [Dense([64])] * 3\n",
    "bottom_actor.learning_rate = 0.001\n",
    "bottom_actor.batch_size = 4096\n",
    "\n",
    "# critic - same inputs and layers as the top level critic\n",
    "bottom_critic = bottom_agent_params.network_wrappers['critic']\n",
    "bottom_critic.input_embedders_parameters = {\n",
    "    input_name: InputEmbedderParameters(scheme=EmbedderScheme.Empty)\n",
    "    for input_name in ('observation', 'action', 'desired_goal')}\n",
    "bottom_critic.embedding_merger_type = EmbeddingMergerType.Concat\n",
    "bottom_critic.middleware_parameters.scheme = [Dense([64])] * 3\n",
    "bottom_critic.learning_rate = 0.001\n",
    "bottom_critic.batch_size = 4096"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we define the parameters of all the agents in the hierarchy from top to bottom"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# one parameters object per level, ordered from the top of the hierarchy to the bottom\n",
    "agents_params = [top_agent_params, bottom_agent_params]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define the environment, visualization and schedule parameters. The schedule parameters refer to the top level agent."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from rl_coach.environments.gym_environment import Mujoco\n",
    "from rl_coach.environments.environment import SelectedPhaseOnlyDumpMethod\n",
    "from rl_coach.graph_managers.hrl_graph_manager import HRLGraphManager\n",
    "from rl_coach.graph_managers.graph_manager import ScheduleParameters\n",
    "\n",
    "\n",
    "# environment - the goal based pendulum, configured with the values defined together with the top agent\n",
    "env_params = Mujoco()\n",
    "env_params.level = \"rl_coach.environments.mujoco.pendulum_with_goals:PendulumWithGoals\"\n",
    "env_params.additional_simulator_parameters = dict(time_limit=time_limit,\n",
    "                                                   random_goals_instead_of_standing_goal=False,\n",
    "                                                   polar_coordinates=polar_coordinates,\n",
    "                                                   goal_reaching_thresholds=distance_from_goal_threshold)\n",
    "env_params.frame_skip = 10\n",
    "env_params.custom_reward_threshold = 1 - time_limit\n",
    "\n",
    "# visualization - video dumps are restricted to the TEST phase, mp4 dumping and native rendering are off\n",
    "vis_params = VisualizationParameters()\n",
    "vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST)]\n",
    "vis_params.dump_mp4 = False\n",
    "vis_params.native_rendering = False\n",
    "\n",
    "# schedule - 40 epochs, where an epoch is 4 small batches of 64 episodes followed by a 64 episodes evaluation\n",
    "num_epochs = 40\n",
    "episodes_per_batch = 64\n",
    "episodes_per_epoch = 4 * episodes_per_batch\n",
    "\n",
    "schedule_params = ScheduleParameters()\n",
    "schedule_params.improve_steps = EnvironmentEpisodes(num_epochs * episodes_per_epoch)\n",
    "schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(episodes_per_epoch)\n",
    "schedule_params.evaluation_steps = EnvironmentEpisodes(episodes_per_batch)\n",
    "schedule_params.heatup_steps = EnvironmentSteps(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Lastly, we create a ```HRLGraphManager``` that will execute the hierarchical agent we defined according to the parameters. \n",
    "\n",
    "Note that the bottom level agent will run 40 steps for each single step of the top level agent (this is set by `consecutive_steps_to_run_each_level`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# number of consecutive environment steps each level is run for\n",
    "steps_per_level = EnvironmentSteps(40)\n",
    "\n",
    "graph_manager = HRLGraphManager(\n",
    "    agents_params=agents_params,\n",
    "    env_params=env_params,\n",
    "    schedule_params=schedule_params,\n",
    "    vis_params=vis_params,\n",
    "    consecutive_steps_to_run_each_level=steps_per_level)\n",
    "# turn rendering on for the run\n",
    "graph_manager.visualization_parameters.render = True"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Running the Preset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from rl_coach.base_parameters import TaskParameters, Frameworks\n",
    "\n",
    "# experiment outputs are written below this directory\n",
    "log_path = '../experiments/pendulum_hac'\n",
    "# exist_ok=True keeps the cell re-runnable and avoids the race between checking for the directory and creating it\n",
    "os.makedirs(log_path, exist_ok=True)\n",
    "\n",
    "task_parameters = TaskParameters(framework_type=Frameworks.tensorflow,\n",
    "                                 evaluate_only=False,\n",
    "                                 experiment_path=log_path)\n",
    "\n",
    "# these two settings are written directly into the instance dictionary\n",
    "# NOTE(review): a checkpoint_save_secs of None presumably turns periodic checkpoint saving off - confirm\n",
    "task_parameters.__dict__['checkpoint_save_secs'] = None\n",
    "task_parameters.__dict__['verbosity'] = 'low'\n",
    "\n",
    "# build the graph for the hierarchy and start training\n",
    "graph_manager.create_graph(task_parameters)\n",
    "\n",
    "graph_manager.improve()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
